Hands-on Machine Learning with R
Boookclub R-Ladies Utrecht and R-Ladies Den Bosch
# Libraries needed
library(tidyverse)
library(kernlab) # fitting SVMs
library(mlbench) # ML benchmark data sets
# Simulate data
set.seed(0841)
spirals <- as.data.frame(
mlbench.spirals(300,
cycles = 2,
sd = 0.09))
names(spirals) <- c("x1", "x2", "classes")
head(spirals) x1 x2 classes
1 0.3894633 -0.01786672 1
2 0.2731469 0.04359156 1
3 0.3394452 0.05940869 1
4 0.1959808 0.09491952 1
5 0.2001946 -0.58237355 2
6 0.3331377 0.12047611 1
Support Vector Machine object of class "ksvm"
SV type: C-svc (classification)
parameter : cost C = 500
Gaussian Radial Basis kernel function.
Hyperparameter : sigma = 1.56383735596073
Number of Support Vectors : 82
Objective Function Value : -15568.76
Training error : 0.023333
Probability model included.
# Grid over which to evaluate decision boundaries
npts <- 500
xgrid <- expand.grid(
x1 = seq(from = -2, 2, length = npts),
x2 = seq(from = -2, 2, length = npts)
)
# Predicted probabilities (as a two-column matrix)
prob_svm <- predict(spirals_svm,
newdata = xgrid,
type = "probabilities")
xgrid2 <- bind_cols(xgrid, prob = prob_svm[,1])# Scatterplots with decision boundaries
ggplot(spirals, aes(x = x1, y = x2)) +
geom_point(aes(shape = classes,
color = classes),
size = 3, alpha = 0.75) +
xlab(expression(X[1])) +
ylab(expression(X[2])) +
xlim(-2, 2) +
ylim(-2, 2) +
coord_fixed() +
theme_bw(base_size = 20) +
theme(legend.position = "none") +
stat_contour(data = xgrid2,
aes(x = x1,
y = x2,
z = prob),
linewidth = 1,
breaks = 0.5,
color = "black")# Load attrition data
df <- modeldata::attrition %>%
#change all factors to unordered factors
mutate_if(is.ordered, factor, ordered = FALSE)
# Create training (70%) and test (30%) sets
set.seed(123) # for reproducibility
library(rsample)
churn_split <- initial_split(df, prop = 0.7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test <- testing(churn_split)# Tune an SVM with radial basis kernel
library(caret)
set.seed(1854) # for reproducibility
churn_svm <- caret::train(
Attrition ~ .,
data = churn_train,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = trainControl(method = "cv",
number = 10),
tuneLength = 10
)
# different results than in book?
ggplot(churn_svm) +
geom_text(aes(label = C),
hjust = "inward",
nudge_y = -0.001) +
theme_light()# Control params for SVM
ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE,
summaryFunction = twoClassSummary ) # needed for AUC/ROC
# Tune an SVM
set.seed(5628) # for reproducibility
churn_svm_auc <- train(Attrition ~ .,
data = churn_train, method = "svmRadial",
preProcess = c("center", "scale"),
metric = "ROC", # area under ROC curve (AUC)
trControl = ctrl, tuneLength = 10)
churn_svm_auc$results %>% round(4) sigma C ROC Sens Spec ROCSD SensSD SpecSD
1 0.0094 0.25 0.8238 0.9641 0.3688 0.0589 0.0149 0.0838
2 0.0094 0.50 0.8240 0.9652 0.3816 0.0588 0.0173 0.0689
3 0.0094 1.00 0.8243 0.9652 0.3757 0.0586 0.0197 0.0946
4 0.0094 2.00 0.8271 0.9791 0.3504 0.0584 0.0092 0.1022
5 0.0094 4.00 0.8234 0.9826 0.3022 0.0583 0.0113 0.0826
6 0.0094 8.00 0.8122 0.9837 0.3129 0.0543 0.0098 0.1370
7 0.0094 16.00 0.7957 0.9837 0.2827 0.0532 0.0098 0.1288
8 0.0094 32.00 0.7864 0.9826 0.3018 0.0537 0.0147 0.1380
9 0.0094 64.00 0.7865 0.9861 0.2710 0.0540 0.0107 0.1104
10 0.0094 128.00 0.7865 0.9837 0.2776 0.0540 0.0098 0.1209
vip()# We want reference class "Yes"
# Make function returning prob of "Yes"
prob_yes <- function(object, newdata) {
predict(object, newdata = newdata, type = "prob")[, "Yes"]
}
set.seed(2827) # for reproducibility
vip::vip(churn_svm_auc,
method = "permute",
nsim = 5,
train = churn_train,
target = "Attrition",
metric = "auc",
reference_class = "Yes",
pred_wrapper = prob_yes) +
theme_bw(base_size = 12)ppdp <- function(x){pdp::partial(churn_svm_auc, pred.var = x, which.class = 2,
prob = TRUE, plot = TRUE, plot.engine = "ggplot2") +
coord_flip() + theme_bw(base_size = 12)
}
#library(patchwork)
ppdp("OverTime") + ppdp("WorkLifeBalance") + ppdp("JobSatisfaction") +
ppdp("JobRole") + plot_layout(ncol = 2)